import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from googletrans import Translator
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('Amazon Sale Report.csv')
df.T
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 128965 | 128966 | 128967 | 128968 | 128969 | 128970 | 128971 | 128972 | 128973 | 128974 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 128965 | 128966 | 128967 | 128968 | 128969 | 128970 | 128971 | 128972 | 128973 | 128974 |
| Order ID | 405-8078784-5731545 | 171-9198151-1101146 | 404-0687676-7273146 | 403-9615377-8133951 | 407-1069790-7240320 | 404-1490984-4578765 | 408-5748499-6859555 | 406-7807733-3785945 | 407-5443024-5233168 | 402-4393761-0311520 | ... | 408-5154281-4593912 | 406-9812666-2474761 | 404-5182288-1653947 | 403-7059995-7618722 | 404-3802633-7250760 | 406-6001380-7673107 | 402-9551604-7544318 | 407-9547469-3152358 | 402-6184140-0545956 | 408-7436540-8728312 |
| Date | 04-30-22 | 04-30-22 | 04-30-22 | 04-30-22 | 04-30-22 | 04-30-22 | 04-30-22 | 04-30-22 | 04-30-22 | 04-30-22 | ... | 05-31-22 | 05-31-22 | 05-31-22 | 05-31-22 | 05-31-22 | 05-31-22 | 05-31-22 | 05-31-22 | 05-31-22 | 05-31-22 |
| Status | Cancelled | Shipped - Delivered to Buyer | Shipped | Cancelled | Shipped | Shipped | Shipped | Shipped - Delivered to Buyer | Cancelled | Shipped | ... | Cancelled | Shipped | Cancelled | Shipped | Cancelled | Shipped | Shipped | Shipped | Shipped | Shipped |
| Fulfilment | Merchant | Merchant | Amazon | Merchant | Amazon | Amazon | Amazon | Merchant | Amazon | Amazon | ... | Amazon | Amazon | Amazon | Amazon | Amazon | Amazon | Amazon | Amazon | Amazon | Amazon |
| Sales Channel | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in | ... | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in |
| ship-service-level | Standard | Standard | Expedited | Standard | Expedited | Expedited | Expedited | Standard | Expedited | Expedited | ... | Expedited | Expedited | Expedited | Expedited | Expedited | Expedited | Expedited | Expedited | Expedited | Expedited |
| Style | SET389 | JNE3781 | JNE3371 | J0341 | JNE3671 | SET264 | J0095 | JNE3405 | SET200 | JNE3461 | ... | J0119 | SET224 | JNE3638 | SET264 | SET044 | JNE3697 | SET401 | J0157 | J0012 | J0003 |
| SKU | SET389-KR-NP-S | JNE3781-KR-XXXL | JNE3371-KR-XL | J0341-DR-L | JNE3671-TU-XXXL | SET264-KR-NP-XL | J0095-SET-L | JNE3405-KR-S | SET200-KR-NP-A-XXXL | JNE3461-KR-XXL | ... | J0119-TP-XXXL | SET224-KR-NP-XS | JNE3638-KR-XS | SET264-KR-NP-XL | SET044-KR-NP-M | JNE3697-KR-XL | SET401-KR-NP-M | J0157-DR-XXL | J0012-SKD-XS | J0003-SET-S |
| Category | Set | kurta | kurta | Western Dress | Top | Set | Set | kurta | Set | kurta | ... | Top | Set | kurta | Set | Set | kurta | Set | Western Dress | Set | Set |
| Size | S | 3XL | XL | L | 3XL | XL | L | S | 3XL | XXL | ... | 3XL | XS | XS | XL | M | XL | M | XXL | XS | S |
| ASIN | B09KXVBD7Z | B09K3WFS32 | B07WV4JV4D | B099NRCT7B | B098714BZP | B08YN7XDSG | B08CMHNWBN | B081WX4G4Q | B08L91ZZXN | B08B3XF5MH | ... | B08RYPRVPV | B08MXDBRK1 | B09814Q3QH | B08YN7XDSG | B07Q2RTSFB | B098112V2V | B09VC6KHX8 | B0982YZ51B | B0894Y2NJQ | B0894X27FC |
| Courier Status | NaN | Shipped | Shipped | NaN | Shipped | Shipped | Shipped | Shipped | Cancelled | Shipped | ... | Unshipped | Shipped | Cancelled | Shipped | Unshipped | Shipped | Shipped | Shipped | Shipped | Shipped |
| Qty | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | ... | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| currency | INR | INR | INR | INR | INR | INR | INR | INR | NaN | INR | ... | INR | INR | NaN | INR | INR | INR | INR | INR | INR | INR |
| Amount | 647.62 | 406.0 | 329.0 | 753.33 | 574.0 | 824.0 | 653.0 | 399.0 | NaN | 363.0 | ... | 574.0 | 1132.0 | NaN | 824.0 | 612.0 | 517.0 | 999.0 | 690.0 | 1199.0 | 696.0 |
| ship-city | MUMBAI | BENGALURU | NAVI MUMBAI | PUDUCHERRY | CHENNAI | GHAZIABAD | CHANDIGARH | HYDERABAD | HYDERABAD | Chennai | ... | Prayagraj (ALLAHABAD) | CHENNAI 600042 | Kolkata | Delhi | MUMBAI | HYDERABAD | GURUGRAM | HYDERABAD | Halol | Raipur |
| ship-state | MAHARASHTRA | KARNATAKA | MAHARASHTRA | PUDUCHERRY | TAMIL NADU | UTTAR PRADESH | CHANDIGARH | TELANGANA | TELANGANA | TAMIL NADU | ... | UTTAR PRADESH | TAMIL NADU | WEST BENGAL | DELHI | MAHARASHTRA | TELANGANA | HARYANA | TELANGANA | Gujarat | CHHATTISGARH |
| ship-postal-code | 400081.0 | 560085.0 | 410210.0 | 605008.0 | 600073.0 | 201102.0 | 160036.0 | 500032.0 | 500008.0 | 600041.0 | ... | 211007.0 | 600042.0 | 700040.0 | 110053.0 | 400017.0 | 500013.0 | 122004.0 | 500049.0 | 389350.0 | 492014.0 |
| ship-country | IN | IN | IN | IN | IN | IN | IN | IN | IN | IN | ... | IN | IN | IN | IN | IN | IN | IN | IN | IN | IN |
| promotion-ids | NaN | Amazon PLCC Free-Financing Universal Merchant ... | IN Core Free Shipping 2015/04/08 23-48-5-108 | NaN | NaN | IN Core Free Shipping 2015/04/08 23-48-5-108 | IN Core Free Shipping 2015/04/08 23-48-5-108 | Amazon PLCC Free-Financing Universal Merchant ... | IN Core Free Shipping 2015/04/08 23-48-5-108 | NaN | ... | NaN | NaN | NaN | IN Core Free Shipping 2015/04/08 23-48-5-108 | NaN | NaN | IN Core Free Shipping 2015/04/08 23-48-5-108 | NaN | IN Core Free Shipping 2015/04/08 23-48-5-108 | IN Core Free Shipping 2015/04/08 23-48-5-108 |
| B2B | False | False | True | False | False | False | False | False | False | False | ... | False | False | False | False | False | False | False | False | False | False |
| fulfilled-by | Easy Ship | Easy Ship | NaN | Easy Ship | NaN | NaN | NaN | Easy Ship | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Unnamed: 22 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | False | False | False | False | False | False | False | False | False | False |
24 rows × 128975 columns
def sniff_modified(df):
with pd.option_context("display.max_colwidth", 20):
info = pd.DataFrame()
info['data type'] = df.dtypes
info['percent missing'] = df.isnull().sum()*100/len(df)
info['No. unique'] = df.apply(lambda x: len(x.unique()))
info['unique values'] = df.apply(lambda x: x.unique())
return info.sort_values('data type')
sniff_modified(df)
| data type | percent missing | No. unique | unique values | |
|---|---|---|---|---|
| B2B | bool | 0.000000 | 2 | [False, True] |
| index | int64 | 0.000000 | 128975 | [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... |
| Qty | int64 | 0.000000 | 10 | [0, 1, 2, 15, 3, 9, 13, 5, 4, 8] |
| ship-postal-code | float64 | 0.025586 | 9460 | [400081.0, 560085.0, 410210.0, 605008.0, 60007... |
| Amount | float64 | 6.043807 | 1411 | [647.62, 406.0, 329.0, 753.33, 574.0, 824.0, 6... |
| promotion-ids | object | 38.110487 | 5788 | [nan, Amazon PLCC Free-Financing Universal Mer... |
| ship-country | object | 0.025586 | 2 | [IN, nan] |
| ship-state | object | 0.025586 | 70 | [MAHARASHTRA, KARNATAKA, PUDUCHERRY, TAMIL NAD... |
| ship-city | object | 0.025586 | 8956 | [MUMBAI, BENGALURU, NAVI MUMBAI, PUDUCHERRY, C... |
| currency | object | 6.043807 | 2 | [INR, nan] |
| Courier Status | object | 5.328164 | 4 | [nan, Shipped, Cancelled, Unshipped] |
| ASIN | object | 0.000000 | 7190 | [B09KXVBD7Z, B09K3WFS32, B07WV4JV4D, B099NRCT7... |
| Size | object | 0.000000 | 11 | [S, 3XL, XL, L, XXL, XS, 6XL, M, 4XL, 5XL, Free] |
| Category | object | 0.000000 | 9 | [Set, kurta, Western Dress, Top, Ethnic Dress,... |
| SKU | object | 0.000000 | 7195 | [SET389-KR-NP-S, JNE3781-KR-XXXL, JNE3371-KR-X... |
| Style | object | 0.000000 | 1377 | [SET389, JNE3781, JNE3371, J0341, JNE3671, SET... |
| ship-service-level | object | 0.000000 | 2 | [Standard, Expedited] |
| Sales Channel | object | 0.000000 | 2 | [Amazon.in, Non-Amazon] |
| Fulfilment | object | 0.000000 | 2 | [Merchant, Amazon] |
| Status | object | 0.000000 | 13 | [Cancelled, Shipped - Delivered to Buyer, Ship... |
| Date | object | 0.000000 | 91 | [04-30-22, 04-29-22, 04-28-22, 04-27-22, 04-26... |
| Order ID | object | 0.000000 | 120378 | [405-8078784-5731545, 171-9198151-1101146, 404... |
| fulfilled-by | object | 69.546811 | 2 | [Easy Ship, nan] |
| Unnamed: 22 | object | 38.030626 | 2 | [nan, False] |
df['Date']=pd.to_datetime(df['Date'])
df.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| index | 0 | 1 | 2 | 3 | 4 |
| Order ID | 405-8078784-5731545 | 171-9198151-1101146 | 404-0687676-7273146 | 403-9615377-8133951 | 407-1069790-7240320 |
| Date | 2022-04-30 00:00:00 | 2022-04-30 00:00:00 | 2022-04-30 00:00:00 | 2022-04-30 00:00:00 | 2022-04-30 00:00:00 |
| Status | Cancelled | Shipped - Delivered to Buyer | Shipped | Cancelled | Shipped |
| Fulfilment | Merchant | Merchant | Amazon | Merchant | Amazon |
| Sales Channel | Amazon.in | Amazon.in | Amazon.in | Amazon.in | Amazon.in |
| ship-service-level | Standard | Standard | Expedited | Standard | Expedited |
| Style | SET389 | JNE3781 | JNE3371 | J0341 | JNE3671 |
| SKU | SET389-KR-NP-S | JNE3781-KR-XXXL | JNE3371-KR-XL | J0341-DR-L | JNE3671-TU-XXXL |
| Category | Set | kurta | kurta | Western Dress | Top |
| Size | S | 3XL | XL | L | 3XL |
| ASIN | B09KXVBD7Z | B09K3WFS32 | B07WV4JV4D | B099NRCT7B | B098714BZP |
| Courier Status | NaN | Shipped | Shipped | NaN | Shipped |
| Qty | 0 | 1 | 1 | 0 | 1 |
| currency | INR | INR | INR | INR | INR |
| Amount | 647.62 | 406.0 | 329.0 | 753.33 | 574.0 |
| ship-city | MUMBAI | BENGALURU | NAVI MUMBAI | PUDUCHERRY | CHENNAI |
| ship-state | MAHARASHTRA | KARNATAKA | MAHARASHTRA | PUDUCHERRY | TAMIL NADU |
| ship-postal-code | 400081.0 | 560085.0 | 410210.0 | 605008.0 | 600073.0 |
| ship-country | IN | IN | IN | IN | IN |
| promotion-ids | NaN | Amazon PLCC Free-Financing Universal Merchant ... | IN Core Free Shipping 2015/04/08 23-48-5-108 | NaN | NaN |
| B2B | False | False | True | False | False |
| fulfilled-by | Easy Ship | Easy Ship | NaN | Easy Ship | NaN |
| Unnamed: 22 | NaN | NaN | NaN | NaN | NaN |
df.isnull().sum()
index 0 Order ID 0 Date 0 Status 0 Fulfilment 0 Sales Channel 0 ship-service-level 0 Style 0 SKU 0 Category 0 Size 0 ASIN 0 Courier Status 6872 Qty 0 currency 7795 Amount 7795 ship-city 33 ship-state 33 ship-postal-code 33 ship-country 33 promotion-ids 49153 B2B 0 fulfilled-by 89698 Unnamed: 22 49050 dtype: int64
columns_to_delete = ['Unnamed: 22', 'fulfilled-by', 'promotion-ids', 'ship-postal-code', 'currency', 'ship-country','Sales Channel ', 'index', 'SKU']
df = df.drop(columns=columns_to_delete)
sniff_modified(df)
| data type | percent missing | No. unique | unique values | |
|---|---|---|---|---|
| Date | datetime64[ns] | 0.000000 | 91 | [2022-04-30 00:00:00, 2022-04-29 00:00:00, 202... |
| B2B | bool | 0.000000 | 2 | [False, True] |
| Qty | int64 | 0.000000 | 10 | [0, 1, 2, 15, 3, 9, 13, 5, 4, 8] |
| Amount | float64 | 6.043807 | 1411 | [647.62, 406.0, 329.0, 753.33, 574.0, 824.0, 6... |
| Order ID | object | 0.000000 | 120378 | [405-8078784-5731545, 171-9198151-1101146, 404... |
| Status | object | 0.000000 | 13 | [Cancelled, Shipped - Delivered to Buyer, Ship... |
| Fulfilment | object | 0.000000 | 2 | [Merchant, Amazon] |
| ship-service-level | object | 0.000000 | 2 | [Standard, Expedited] |
| Style | object | 0.000000 | 1377 | [SET389, JNE3781, JNE3371, J0341, JNE3671, SET... |
| Category | object | 0.000000 | 9 | [Set, kurta, Western Dress, Top, Ethnic Dress,... |
| Size | object | 0.000000 | 11 | [S, 3XL, XL, L, XXL, XS, 6XL, M, 4XL, 5XL, Free] |
| ASIN | object | 0.000000 | 7190 | [B09KXVBD7Z, B09K3WFS32, B07WV4JV4D, B099NRCT7... |
| Courier Status | object | 5.328164 | 4 | [nan, Shipped, Cancelled, Unshipped] |
| ship-city | object | 0.025586 | 8956 | [MUMBAI, BENGALURU, NAVI MUMBAI, PUDUCHERRY, C... |
| ship-state | object | 0.025586 | 70 | [MAHARASHTRA, KARNATAKA, PUDUCHERRY, TAMIL NAD... |
df.shape
(128975, 15)
df_cleaned_rows = df.dropna()
sniff_modified(df_cleaned_rows)
| data type | percent missing | No. unique | unique values | |
|---|---|---|---|---|
| Date | datetime64[ns] | 0.0 | 91 | [2022-04-30 00:00:00, 2022-04-29 00:00:00, 202... |
| B2B | bool | 0.0 | 2 | [False, True] |
| Qty | int64 | 0.0 | 6 | [1, 2, 3, 5, 4, 8] |
| Amount | float64 | 0.0 | 867 | [406.0, 329.0, 574.0, 824.0, 653.0, 399.0, 363... |
| Order ID | object | 0.0 | 108271 | [171-9198151-1101146, 404-0687676-7273146, 407... |
| Status | object | 0.0 | 12 | [Shipped - Delivered to Buyer, Shipped, Cancel... |
| Fulfilment | object | 0.0 | 2 | [Merchant, Amazon] |
| ship-service-level | object | 0.0 | 2 | [Standard, Expedited] |
| Style | object | 0.0 | 1371 | [JNE3781, JNE3371, JNE3671, SET264, J0095, JNE... |
| Category | object | 0.0 | 9 | [kurta, Top, Set, Western Dress, Ethnic Dress,... |
| Size | object | 0.0 | 11 | [3XL, XL, L, S, XXL, XS, 6XL, M, 4XL, 5XL, Free] |
| ASIN | object | 0.0 | 7128 | [B09K3WFS32, B07WV4JV4D, B098714BZP, B08YN7XDS... |
| Courier Status | object | 0.0 | 2 | [Shipped, Unshipped] |
| ship-city | object | 0.0 | 8499 | [BENGALURU, NAVI MUMBAI, CHENNAI, GHAZIABAD, C... |
| ship-state | object | 0.0 | 67 | [KARNATAKA, MAHARASHTRA, TAMIL NADU, UTTAR PRA... |
df_cleaned_rows.shape
(116016, 15)
## !pip install googletrans==4.0.0-rc1
translator = Translator()
def translate_to_english(text):
if text.isascii():
return text
else:
translation = translator.translate(text, src='hi', dest='en')
return translation.text
df_cleaned_rows['ship-city'] = df_cleaned_rows['ship-city'].apply(translate_to_english)
df_cleaned_rows['ship-city']=df_cleaned_rows['ship-city'].str.capitalize()
df_cleaned_rows['ship-city']=df_cleaned_rows['ship-city'].replace({'Guwahati, kamrup (m)': 'Guwahati',
'Kolkata 700034':'Kolkata',
'Dhamnod dhar district': 'Dhamnod dhar',
'Nk sweets vikasnagar':'vikasnagar',
'Mandvi kachchh district':'Mandvi kachchh',
'Andul near maya stores':'Andul',
'district': '',
'Neelakudi, thiruvarur': 'Neelakudi',
'Arvi,dist- wardha':'Arvi',
'Mumbai 400101': 'Mumbai',
'Kamatgi,hunugund taluk,bagalkot district':'bagalkot',
'extension': '',
'Durganagar ,nimta south':'Durganagar',
'and ':'',
'Mumbai,malad west,malvani.':'malvani',
'Hassan (amazon arun)':'Hassan',
'Phanigiri road,chaitanyapuri,hyderabad':'chaitanyapuri',
'Town.budhana village. husainpur kalan':'husainpur',
'New delhi-110075':'New delhi',
'Mumbai -400064':'Mumbai',
'Mumbai 400023':'Mumbai',
'Post office-harrawala, dehradun':'dehradun',
'Nuvem. ( do not ring the door bell, call b4 comg':'Nuvem',
'Tehsil - sikandrabad, district - bulandshahr':'bulandshahr',
'Mumbai 400057':'Mumbai',
'Pune-412207':'Pune'
})
df_cleaned_rows['ship-city'].unique()
array(['Bengaluru', 'Navi mumbai', 'Chennai', ...,
'Vaishali nagar nagpur', 'Gulabpura, distt bhilwara',
'Prayagraj (allahabad)'], dtype=object)
df_cleaned_rows['ship-state']=df_cleaned_rows['ship-state'].str.capitalize()
df_cleaned_rows['ship-state']=df_cleaned_rows['ship-state'].replace({'Punjab/mohali/zirakpur': 'Punjab',
'rajshthan': 'Rajasthan',
'Puducherry':'Pondicherry',
'Pb': 'Punjab',
'Ar': 'Arunachal pradesh',
'Nl': 'Nagaland',
'Rj': 'Rajasthan',
'Orissa':'Odisha',
'Rajsthan': 'Rajasthan',
'New delhi': 'Delhi'
})
df_cleaned_rows['ship-state'].unique()
array(['Karnataka', 'Maharashtra', 'Tamil nadu', 'Uttar pradesh',
'Chandigarh', 'Telangana', 'Andhra pradesh', 'Rajasthan', 'Delhi',
'Haryana', 'Assam', 'Jharkhand', 'Chhattisgarh', 'Odisha',
'Kerala', 'Madhya pradesh', 'West bengal', 'Nagaland', 'Gujarat',
'Uttarakhand', 'Bihar', 'Jammu & kashmir', 'Punjab',
'Himachal pradesh', 'Arunachal pradesh', 'Manipur', 'Goa',
'Meghalaya', 'Pondicherry', 'Tripura', 'Ladakh', 'Dadra and nagar',
'Sikkim', 'Andaman & nicobar ', 'Rajshthan', 'Mizoram',
'Lakshadweep'], dtype=object)
df_cleaned_rows['Order ID'] = df_cleaned_rows['Order ID'].apply(lambda x: re.sub(r'\D', '', str(x)))
df_cleaned_rows['Order ID'] = pd.to_numeric(df_cleaned_rows['Order ID'], errors='coerce')
df_cleaned_rows.shape
(116016, 15)
# df_cleaned_rows.to_excel('marketing.xlsx')
top5_size = df_cleaned_rows.groupby('Size')['Qty'].sum().sort_values(ascending = False).head(5)
top5_size
Size M 20422 L 19963 XL 18898 XXL 16487 S 15309 Name: Qty, dtype: int64
top5_city = df_cleaned_rows.groupby('ship-state')['Amount'].sum().sort_values(ascending = False).head(5)
top5_city
ship-state Maharashtra 12864511.0 Karnataka 10153100.0 Telangana 6642955.0 Uttar pradesh 6494393.0 Tamil nadu 6241913.0 Name: Amount, dtype: float64
# df_cleaned_rows.to_csv('output1.csv', index=False)
categorical_cols= ['B2B','Fulfilment','ship-service-level','Courier Status', 'Size', 'Status', 'Category', 'Qty', 'Style','ASIN','ship-city','ship-state' ]
categorical_cols
['B2B', 'Fulfilment', 'ship-service-level', 'Courier Status', 'Size', 'Status', 'Category', 'Qty', 'Style', 'ASIN', 'ship-city', 'ship-state']
all_cols = ['Order ID','Amount']
all_cols
['Order ID', 'Amount']
for col in all_cols:
fig, ax = plt.subplots(figsize=(10, 6))
df_cleaned_rows[col].hist(bins=10)
plt.title(f'Histogram of {col}')
plt.xlabel(col)
plt.ylabel('Frequency')
plt.show()
print(df_cleaned_rows[col].mean())
print(df_cleaned_rows[col].median())
3.768420902031196e+16 4.050856516269754e+16
649.7808491932147 606.0
for col in categorical_cols:
plt.figure(figsize=(15, 6))
# Plot only the top 10 categories for better readability
top_n = 20
df_cleaned_rows[col].value_counts().nlargest(top_n).plot(kind='bar', color='skyblue')
plt.title(f'Bar Plot of {col}')
plt.xlabel(col)
plt.ylabel('Frequency')
plt.xticks(rotation=90)
plt.show()
dominating_category = df_cleaned_rows[col].value_counts().idxmax()
print(f"In column '{col}', the dominating category is '{dominating_category}' with {df_cleaned_rows[col].value_counts().max()} occurrences.\n")
In column 'B2B', the dominating category is 'False' with 115191 occurrences.
In column 'Fulfilment', the dominating category is 'Amazon' with 83621 occurrences.
In column 'ship-service-level', the dominating category is 'Expedited' with 82705 occurrences.
In column 'Courier Status', the dominating category is 'Shipped' with 109461 occurrences.
In column 'Size', the dominating category is 'M' with 20339 occurrences.
In column 'Status', the dominating category is 'Shipped' with 77580 occurrences.
In column 'Category', the dominating category is 'Set' with 45079 occurrences.
In column 'Qty', the dominating category is '1' with 115637 occurrences.
In column 'Style', the dominating category is 'JNE3797' with 3676 occurrences.
In column 'ASIN', the dominating category is 'B09SDXFFQ1' with 660 occurrences.
In column 'ship-city', the dominating category is 'Bengaluru' with 11004 occurrences.
In column 'ship-state', the dominating category is 'Maharashtra' with 20273 occurrences.
df_cleaned_rows['RecencyDate'] = df_cleaned_rows['Date']
current_date = df_cleaned_rows['Date'].max()
rfm_data = df_cleaned_rows.groupby('ship-state').agg({
'RecencyDate': lambda x: (current_date - x.max()).days, # Recency
'Date': 'count', # Frequency
'Amount': 'sum' # Monetary
}).rename(columns={
'RecencyDate': 'Recency',
'Date': 'Frequency',
'Amount': 'Monetary'
})
rfm_data
| Recency | Frequency | Monetary | |
|---|---|---|---|
| ship-state | |||
| Andaman & nicobar | 0 | 223 | 146093.0 |
| Andhra pradesh | 0 | 4795 | 3049151.0 |
| Arunachal pradesh | 1 | 135 | 94865.0 |
| Assam | 0 | 1482 | 976951.0 |
| Bihar | 0 | 1868 | 1351529.0 |
| Chandigarh | 0 | 304 | 203354.0 |
| Chhattisgarh | 0 | 820 | 541811.0 |
| Dadra and nagar | 3 | 58 | 39276.0 |
| Delhi | 0 | 6385 | 4231084.0 |
| Goa | 0 | 1051 | 622220.0 |
| Gujarat | 0 | 3971 | 2618903.0 |
| Haryana | 0 | 4043 | 2791446.0 |
| Himachal pradesh | 1 | 675 | 470548.0 |
| Jammu & kashmir | 0 | 614 | 432712.0 |
| Jharkhand | 0 | 1279 | 868953.0 |
| Karnataka | 0 | 15844 | 10153100.0 |
| Kerala | 0 | 5793 | 3618505.0 |
| Ladakh | 1 | 40 | 36737.0 |
| Lakshadweep | 23 | 3 | 2441.0 |
| Madhya pradesh | 0 | 2258 | 1525537.0 |
| Maharashtra | 0 | 20273 | 12864511.0 |
| Manipur | 1 | 294 | 210922.0 |
| Meghalaya | 0 | 184 | 111234.0 |
| Mizoram | 1 | 67 | 38503.0 |
| Nagaland | 0 | 172 | 140129.0 |
| Odisha | 0 | 1904 | 1324830.0 |
| Pondicherry | 0 | 308 | 182198.0 |
| Punjab | 0 | 1732 | 1163716.0 |
| Rajasthan | 0 | 2424 | 1682199.0 |
| Rajshthan | 61 | 2 | 1126.0 |
| Sikkim | 0 | 182 | 134847.0 |
| Tamil nadu | 0 | 10345 | 6241913.0 |
| Telangana | 0 | 10208 | 6642955.0 |
| Tripura | 0 | 134 | 86799.0 |
| Uttar pradesh | 0 | 9462 | 6494393.0 |
| Uttarakhand | 0 | 1383 | 932314.0 |
| West bengal | 0 | 5301 | 3357170.0 |
scaler = StandardScaler()
normalized_rfm = pd.DataFrame(scaler.fit_transform(rfm_data), columns=rfm_data.columns)
print(normalized_rfm)
Recency Frequency Monetary 0 -0.238096 -0.631746 -0.642679 1 -0.238096 0.359937 0.343783 2 -0.142340 -0.650833 -0.660086 3 -0.238096 -0.358664 -0.360353 4 -0.238096 -0.274940 -0.233071 5 -0.238096 -0.614177 -0.623222 6 -0.238096 -0.502255 -0.508214 7 0.049172 -0.667535 -0.678976 8 -0.238096 0.704813 0.745406 9 -0.238096 -0.452150 -0.480891 10 -0.238096 0.181208 0.197585 11 -0.238096 0.196825 0.256215 12 -0.142340 -0.533706 -0.532429 13 -0.238096 -0.546937 -0.545286 14 -0.238096 -0.402696 -0.397050 15 -0.238096 2.756503 2.757714 16 -0.238096 0.576406 0.537251 17 -0.142340 -0.671439 -0.679838 18 1.964292 -0.679465 -0.691492 19 -0.238096 -0.190347 -0.173942 20 -0.238096 3.717168 3.679054 21 -0.142340 -0.616346 -0.620650 22 -0.238096 -0.640205 -0.654524 23 -0.142340 -0.665583 -0.679238 24 -0.238096 -0.642808 -0.644706 25 -0.238096 -0.267131 -0.242143 26 -0.238096 -0.613309 -0.630411 27 -0.238096 -0.304439 -0.296890 28 -0.238096 -0.154341 -0.120709 29 5.603020 -0.679682 -0.691939 30 -0.238096 -0.640639 -0.646501 31 -0.238096 1.563751 1.428688 32 -0.238096 1.534035 1.564962 33 -0.238096 -0.651050 -0.662827 34 -0.238096 1.372225 1.514481 35 -0.238096 -0.380138 -0.375520 36 -0.238096 0.469690 0.448449
rfm_data
| Recency | Frequency | Monetary | |
|---|---|---|---|
| ship-state | |||
| Andaman & nicobar | 0 | 223 | 146093.0 |
| Andhra pradesh | 0 | 4795 | 3049151.0 |
| Arunachal pradesh | 1 | 135 | 94865.0 |
| Assam | 0 | 1482 | 976951.0 |
| Bihar | 0 | 1868 | 1351529.0 |
| Chandigarh | 0 | 304 | 203354.0 |
| Chhattisgarh | 0 | 820 | 541811.0 |
| Dadra and nagar | 3 | 58 | 39276.0 |
| Delhi | 0 | 6385 | 4231084.0 |
| Goa | 0 | 1051 | 622220.0 |
| Gujarat | 0 | 3971 | 2618903.0 |
| Haryana | 0 | 4043 | 2791446.0 |
| Himachal pradesh | 1 | 675 | 470548.0 |
| Jammu & kashmir | 0 | 614 | 432712.0 |
| Jharkhand | 0 | 1279 | 868953.0 |
| Karnataka | 0 | 15844 | 10153100.0 |
| Kerala | 0 | 5793 | 3618505.0 |
| Ladakh | 1 | 40 | 36737.0 |
| Lakshadweep | 23 | 3 | 2441.0 |
| Madhya pradesh | 0 | 2258 | 1525537.0 |
| Maharashtra | 0 | 20273 | 12864511.0 |
| Manipur | 1 | 294 | 210922.0 |
| Meghalaya | 0 | 184 | 111234.0 |
| Mizoram | 1 | 67 | 38503.0 |
| Nagaland | 0 | 172 | 140129.0 |
| Odisha | 0 | 1904 | 1324830.0 |
| Pondicherry | 0 | 308 | 182198.0 |
| Punjab | 0 | 1732 | 1163716.0 |
| Rajasthan | 0 | 2424 | 1682199.0 |
| Rajshthan | 61 | 2 | 1126.0 |
| Sikkim | 0 | 182 | 134847.0 |
| Tamil nadu | 0 | 10345 | 6241913.0 |
| Telangana | 0 | 10208 | 6642955.0 |
| Tripura | 0 | 134 | 86799.0 |
| Uttar pradesh | 0 | 9462 | 6494393.0 |
| Uttarakhand | 0 | 1383 | 932314.0 |
| West bengal | 0 | 5301 | 3357170.0 |
wcss = []
for i in range(1, 7):
kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
kmeans.fit(normalized_rfm)
wcss.append(kmeans.inertia_)
# Plot the Elbow method
plt.figure(figsize=(10,5))
plt.plot(range(1, 7), wcss, marker='o', linestyle='--')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('K-means clustering Elbow Method')
plt.show()
optimal_clusters = 2
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', random_state=42)
clusters = kmeans.fit_predict(normalized_rfm)
rfm_data['Cluster'] = clusters
kmeans = KMeans(n_clusters = 2)
y_kmeans = kmeans.fit_predict(normalized_rfm)
rfm_data['Cluster_ID'] = y_kmeans
rfm_data
| Recency | Frequency | Monetary | Cluster | Cluster_ID | |
|---|---|---|---|---|---|
| ship-state | |||||
| Andaman & nicobar | 0 | 223 | 146093.0 | 0 | 1 |
| Andhra pradesh | 0 | 4795 | 3049151.0 | 0 | 1 |
| Arunachal pradesh | 1 | 135 | 94865.0 | 0 | 1 |
| Assam | 0 | 1482 | 976951.0 | 0 | 1 |
| Bihar | 0 | 1868 | 1351529.0 | 0 | 1 |
| Chandigarh | 0 | 304 | 203354.0 | 0 | 1 |
| Chhattisgarh | 0 | 820 | 541811.0 | 0 | 1 |
| Dadra and nagar | 3 | 58 | 39276.0 | 0 | 1 |
| Delhi | 0 | 6385 | 4231084.0 | 0 | 1 |
| Goa | 0 | 1051 | 622220.0 | 0 | 1 |
| Gujarat | 0 | 3971 | 2618903.0 | 0 | 1 |
| Haryana | 0 | 4043 | 2791446.0 | 0 | 1 |
| Himachal pradesh | 1 | 675 | 470548.0 | 0 | 1 |
| Jammu & kashmir | 0 | 614 | 432712.0 | 0 | 1 |
| Jharkhand | 0 | 1279 | 868953.0 | 0 | 1 |
| Karnataka | 0 | 15844 | 10153100.0 | 1 | 0 |
| Kerala | 0 | 5793 | 3618505.0 | 0 | 1 |
| Ladakh | 1 | 40 | 36737.0 | 0 | 1 |
| Lakshadweep | 23 | 3 | 2441.0 | 0 | 1 |
| Madhya pradesh | 0 | 2258 | 1525537.0 | 0 | 1 |
| Maharashtra | 0 | 20273 | 12864511.0 | 1 | 0 |
| Manipur | 1 | 294 | 210922.0 | 0 | 1 |
| Meghalaya | 0 | 184 | 111234.0 | 0 | 1 |
| Mizoram | 1 | 67 | 38503.0 | 0 | 1 |
| Nagaland | 0 | 172 | 140129.0 | 0 | 1 |
| Odisha | 0 | 1904 | 1324830.0 | 0 | 1 |
| Pondicherry | 0 | 308 | 182198.0 | 0 | 1 |
| Punjab | 0 | 1732 | 1163716.0 | 0 | 1 |
| Rajasthan | 0 | 2424 | 1682199.0 | 0 | 1 |
| Rajshthan | 61 | 2 | 1126.0 | 0 | 1 |
| Sikkim | 0 | 182 | 134847.0 | 0 | 1 |
| Tamil nadu | 0 | 10345 | 6241913.0 | 1 | 0 |
| Telangana | 0 | 10208 | 6642955.0 | 1 | 0 |
| Tripura | 0 | 134 | 86799.0 | 0 | 1 |
| Uttar pradesh | 0 | 9462 | 6494393.0 | 1 | 0 |
| Uttarakhand | 0 | 1383 | 932314.0 | 0 | 1 |
| West bengal | 0 | 5301 | 3357170.0 | 0 | 1 |
facet = sns.lmplot(rfm_data, x='Frequency', y='Monetary', hue='Cluster_ID',
fit_reg=False, legend=True, legend_out=True)
import plotly.express as px
fig = px.scatter_3d(rfm_data, x='Frequency', y='Monetary', z='Recency', color='Cluster_ID', hover_name = rfm_data.index )
fig.show()
rfm_data[rfm_data['Cluster_ID'] == 1]
| Recency | Frequency | Monetary | Cluster | Cluster_ID | |
|---|---|---|---|---|---|
| ship-state | |||||
| Andaman & nicobar | 0 | 223 | 146093.0 | 0 | 1 |
| Andhra pradesh | 0 | 4795 | 3049151.0 | 0 | 1 |
| Arunachal pradesh | 1 | 135 | 94865.0 | 0 | 1 |
| Assam | 0 | 1482 | 976951.0 | 0 | 1 |
| Bihar | 0 | 1868 | 1351529.0 | 0 | 1 |
| Chandigarh | 0 | 304 | 203354.0 | 0 | 1 |
| Chhattisgarh | 0 | 820 | 541811.0 | 0 | 1 |
| Dadra and nagar | 3 | 58 | 39276.0 | 0 | 1 |
| Delhi | 0 | 6385 | 4231084.0 | 0 | 1 |
| Goa | 0 | 1051 | 622220.0 | 0 | 1 |
| Gujarat | 0 | 3971 | 2618903.0 | 0 | 1 |
| Haryana | 0 | 4043 | 2791446.0 | 0 | 1 |
| Himachal pradesh | 1 | 675 | 470548.0 | 0 | 1 |
| Jammu & kashmir | 0 | 614 | 432712.0 | 0 | 1 |
| Jharkhand | 0 | 1279 | 868953.0 | 0 | 1 |
| Kerala | 0 | 5793 | 3618505.0 | 0 | 1 |
| Ladakh | 1 | 40 | 36737.0 | 0 | 1 |
| Lakshadweep | 23 | 3 | 2441.0 | 0 | 1 |
| Madhya pradesh | 0 | 2258 | 1525537.0 | 0 | 1 |
| Manipur | 1 | 294 | 210922.0 | 0 | 1 |
| Meghalaya | 0 | 184 | 111234.0 | 0 | 1 |
| Mizoram | 1 | 67 | 38503.0 | 0 | 1 |
| Nagaland | 0 | 172 | 140129.0 | 0 | 1 |
| Odisha | 0 | 1904 | 1324830.0 | 0 | 1 |
| Pondicherry | 0 | 308 | 182198.0 | 0 | 1 |
| Punjab | 0 | 1732 | 1163716.0 | 0 | 1 |
| Rajasthan | 0 | 2424 | 1682199.0 | 0 | 1 |
| Rajshthan | 61 | 2 | 1126.0 | 0 | 1 |
| Sikkim | 0 | 182 | 134847.0 | 0 | 1 |
| Tripura | 0 | 134 | 86799.0 | 0 | 1 |
| Uttarakhand | 0 | 1383 | 932314.0 | 0 | 1 |
| West bengal | 0 | 5301 | 3357170.0 | 0 | 1 |
label_encoder = LabelEncoder()
df_cleaned_rows[categorical_cols]=df_cleaned_rows[categorical_cols].apply(LabelEncoder().fit_transform)
min_max_scaler = MinMaxScaler()
df_cleaned_rows[all_cols] = min_max_scaler.fit_transform(df_cleaned_rows[all_cols])
df_cleaned_rows
| Order ID | Date | Status | Fulfilment | ship-service-level | Style | Category | Size | ASIN | Courier Status | Qty | Amount | ship-city | ship-state | B2B | RecencyDate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.003865 | 2022-04-30 | 5 | 1 | 1 | 845 | 8 | 0 | 5229 | 0 | 0 | 0.072708 | 764 | 15 | 0 | 2022-04-30 |
| 2 | 0.979281 | 2022-04-30 | 3 | 0 | 0 | 531 | 8 | 8 | 666 | 0 | 0 | 0.058918 | 4412 | 20 | 1 | 2022-04-30 |
| 4 | 0.992046 | 2022-04-30 | 3 | 0 | 0 | 752 | 6 | 0 | 4308 | 0 | 0 | 0.102794 | 1237 | 31 | 0 | 2022-04-30 |
| 5 | 0.979618 | 2022-04-30 | 3 | 0 | 0 | 1230 | 5 | 8 | 2904 | 0 | 0 | 0.147564 | 2054 | 34 | 0 | 2022-04-30 |
| 6 | 0.998214 | 2022-04-30 | 3 | 0 | 0 | 180 | 5 | 5 | 1656 | 0 | 0 | 0.116941 | 1155 | 5 | 0 | 2022-04-30 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 128970 | 0.989917 | 2022-05-31 | 3 | 0 | 0 | 772 | 8 | 8 | 3920 | 0 | 0 | 0.092586 | 2405 | 32 | 0 | 2022-05-31 |
| 128971 | 0.974602 | 2022-05-31 | 3 | 0 | 0 | 1346 | 5 | 6 | 6780 | 0 | 0 | 0.178904 | 2209 | 11 | 0 | 2022-05-31 |
| 128972 | 0.995608 | 2022-05-31 | 3 | 0 | 0 | 236 | 7 | 10 | 4031 | 0 | 0 | 0.123567 | 2405 | 32 | 0 | 2022-05-31 |
| 128973 | 0.973187 | 2022-05-31 | 3 | 0 | 0 | 119 | 5 | 9 | 1300 | 0 | 0 | 0.214721 | 2257 | 10 | 0 | 2022-05-31 |
| 128974 | 0.998923 | 2022-05-31 | 3 | 0 | 0 | 110 | 5 | 7 | 1258 | 0 | 0 | 0.124642 | 5309 | 6 | 0 | 2022-05-31 |
116016 rows × 16 columns
id_col= ['Order ID']
drop_col= ['Courier Status','B2B','Date','RecencyDate']
target_col = ['Status']
X = df_cleaned_rows.drop(target_col, axis=1)
X = df_cleaned_rows.drop(drop_col, axis=1)
y=df_cleaned_rows[target_col]
X.shape, y.shape
((116016, 12), (116016, 1))
X
| Order ID | Status | Fulfilment | ship-service-level | Style | Category | Size | ASIN | Qty | Amount | ship-city | ship-state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.003865 | 5 | 1 | 1 | 845 | 8 | 0 | 5229 | 0 | 0.072708 | 764 | 15 |
| 2 | 0.979281 | 3 | 0 | 0 | 531 | 8 | 8 | 666 | 0 | 0.058918 | 4412 | 20 |
| 4 | 0.992046 | 3 | 0 | 0 | 752 | 6 | 0 | 4308 | 0 | 0.102794 | 1237 | 31 |
| 5 | 0.979618 | 3 | 0 | 0 | 1230 | 5 | 8 | 2904 | 0 | 0.147564 | 2054 | 34 |
| 6 | 0.998214 | 3 | 0 | 0 | 180 | 5 | 5 | 1656 | 0 | 0.116941 | 1155 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 128970 | 0.989917 | 3 | 0 | 0 | 772 | 8 | 8 | 3920 | 0 | 0.092586 | 2405 | 32 |
| 128971 | 0.974602 | 3 | 0 | 0 | 1346 | 5 | 6 | 6780 | 0 | 0.178904 | 2209 | 11 |
| 128972 | 0.995608 | 3 | 0 | 0 | 236 | 7 | 10 | 4031 | 0 | 0.123567 | 2405 | 32 |
| 128973 | 0.973187 | 3 | 0 | 0 | 119 | 5 | 9 | 1300 | 0 | 0.214721 | 2257 | 10 |
| 128974 | 0.998923 | 3 | 0 | 0 | 110 | 5 | 7 | 1258 | 0 | 0.124642 | 5309 | 6 |
116016 rows × 12 columns
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.20)
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('\nClassification Report:')
print(classification_rep)
Accuracy: 0.6757
Classification Report:
precision recall f1-score support
0 0.00 0.00 0.00 1095
1 0.00 0.00 0.00 153
2 0.00 0.00 0.00 60
3 0.68 1.00 0.81 15679
4 0.00 0.00 0.00 1
5 0.00 0.00 0.00 5566
7 0.00 0.00 0.00 5
8 0.00 0.00 0.00 205
9 0.00 0.00 0.00 5
10 0.00 0.00 0.00 405
11 0.00 0.00 0.00 30
accuracy 0.68 23204
macro avg 0.06 0.09 0.07 23204
weighted avg 0.46 0.68 0.54 23204
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('\nClassification Report:')
print(classification_rep)
Accuracy: 0.9994
Classification Report:
precision recall f1-score support
0 1.00 1.00 1.00 1095
1 1.00 1.00 1.00 153
2 1.00 1.00 1.00 60
3 1.00 1.00 1.00 15679
4 0.00 0.00 0.00 1
5 1.00 1.00 1.00 5566
7 1.00 0.40 0.57 5
8 0.96 1.00 0.98 205
9 0.00 0.00 0.00 5
10 0.99 1.00 0.99 405
11 1.00 0.80 0.89 30
accuracy 1.00 23204
macro avg 0.81 0.75 0.77 23204
weighted avg 1.00 1.00 1.00 23204